Exploratory Data Analysis: Data & Analytics Job Market

Code
# Configure Plotly to emit both a MIME bundle and classic notebook output so
# figures render interactively in the notebook and survive static export.
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"

Research Question: What does the job market look like for Business Analytics, Data Science, and ML professionals in 2024?

What we’re Looking into:

1. Who’s hiring? → Top industries (barplot) and companies (treemap)
2. What roles exist? → Job titles within our occupation categories
3. What do they say? → Word cloud from job descriptions
4. What skills do they want? → Radar charts for each occupation + software skills barplot
5. What drives salary? → Salary by remote work type
6. Where are jobs posted? → Source types analysis

Each insight builds toward our ML modeling decisions.

Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from wordcloud import WordCloud
import re

pd.set_option('display.max_columns', None)

# Load data
# NOTE(review): relative path — assumes the notebook is run from the project root.
df = pd.read_csv('data/lightcast_cleaned.csv')
df['POSTED'] = pd.to_datetime(df['POSTED'])

# Exclude unclassified/unknown values for cleaner analysis.
# NOTE(review): the industry filter matches 'Unclassified' case-sensitively while
# the company/title filters use case=False — confirm the asymmetry is intentional.
df_clean = df[~df['NAICS_2022_2_NAME'].str.contains('Unclassified', na=False)].copy()
df_clean = df_clean[~df_clean['COMPANY_NAME'].str.contains('Unclassified', case=False, na=False)].copy()
df_clean = df_clean[~df_clean['TITLE_NAME'].str.contains('Unclassified', case=False, na=False)].copy()

# Headline stats: raw vs. filtered row counts, posting window, occupation coverage.
print(f"Dataset: {len(df):,} job postings")
print(f"After filtering unclassified: {len(df_clean):,} job postings")
print(f"Date range: {df['POSTED'].min().strftime('%b %Y')} - {df['POSTED'].max().strftime('%b %Y')}")
print(f"Occupations: {df['LOT_V6_OCCUPATION_NAME'].nunique()}")
Dataset: 55,917 job postings
After filtering unclassified: 46,713 job postings
Date range: May 2024 - Sep 2024
Occupations: 4

Who’s Hiring?

Code
# Top 10 Industries - Horizontal Bar Plot
industry_counts = df_clean['NAICS_2022_2_NAME'].value_counts().head(10)

fig = go.Figure()
fig.add_trace(go.Bar(
    y=industry_counts.index,
    x=industry_counts.values,
    orientation='h',
    marker_color='steelblue',
    text=industry_counts.values,
    textposition='outside'
))

fig.update_layout(
    title='Top 10 Industries Hiring Data Professionals',
    xaxis_title='Number of Job Postings',
    # 'total ascending' places the largest bar at the top of a horizontal chart.
    yaxis={'categoryorder': 'total ascending'},
    template='plotly_white',
    height=450,
    margin=dict(r=80)  # extra right margin for the outside-positioned count labels
)
# NOTE(review): write_image needs the kaleido package installed at runtime.
fig.write_image('figures/top_industries.png', scale=2)
fig.show()

# Top 10 Companies - Treemap visualization
company_counts = df_clean['COMPANY_NAME'].value_counts().head(10).reset_index()
company_counts.columns = ['Company', 'Postings']

fig2 = px.treemap(
    company_counts,
    path=['Company'],
    values='Postings',
    title='Top 10 Companies Hiring Data Professionals',
    color='Postings',
    color_continuous_scale='Viridis'
)
fig2.update_layout(height=500, template='plotly_white')
fig2.update_traces(textinfo='label+value+percent root')
fig2.write_image('figures/top_companies_treemap.png', scale=2)
fig2.show()

# Share is computed against the filtered dataset (df_clean), not the raw df.
print("\nTop 10 Companies by Job Postings:")
for i, row in company_counts.iterrows():
    pct = row['Postings'] / len(df_clean) * 100
    print(f"  {i+1}. {row['Company']}: {row['Postings']:,} postings ({pct:.1f}%)")

Top 10 Companies by Job Postings:
  1. Deloitte: 2,271 postings (4.9%)
  2. Accenture: 1,316 postings (2.8%)
  3. PricewaterhouseCoopers: 697 postings (1.5%)
  4. Insight Global: 355 postings (0.8%)
  5. Cardinal Health: 346 postings (0.7%)
  6. Smx Corporation Limited: 317 postings (0.7%)
  7. Oracle: 311 postings (0.7%)
  8. Robert Half: 308 postings (0.7%)
  9. Randstad: 285 postings (0.6%)
  10. Lumen Technologies: 267 postings (0.6%)

What Roles can you Go for?

Code
# Top job titles per occupation, as a 2x2 grid of horizontal bar charts.
occupations = df_clean['LOT_V6_OCCUPATION_NAME'].unique()
# Fixed color per occupation so the palette stays consistent across figures.
colors = {'Data / Data Mining Analyst': '#1f77b4', 
          'Business Intelligence Analyst': '#ff7f0e', 
          'Business / Management Analyst': '#2ca02c', 
          'Market Research Analyst': '#d62728'}

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[f"{occ}" for occ in occupations],
    horizontal_spacing=0.15,
    vertical_spacing=0.25
)

for idx, occ in enumerate(occupations):
    # Map the flat index onto the 2x2 grid (row-major).
    row = idx // 2 + 1
    col = idx % 2 + 1
    
    # 'Unclassified' titles were already removed when df_clean was built,
    # so the previous re-filtering step here was a no-op and has been dropped.
    df_occ = df_clean[df_clean['LOT_V6_OCCUPATION_NAME'] == occ]
    top_titles = df_occ['TITLE_NAME'].value_counts().head(8)
    
    # Truncate long titles so y-axis labels stay readable in the subplots.
    truncated_titles = [t[:28] + '...' if len(t) > 28 else t for t in top_titles.index]
    
    fig.add_trace(
        go.Bar(
            y=truncated_titles,
            x=top_titles.values,
            orientation='h',
            marker_color=colors.get(occ, '#636EFA'),
            text=top_titles.values,
            textposition='outside',
            name=occ,
            showlegend=False
        ),
        row=row, col=col
    )
    
    fig.update_xaxes(title_text="Count", row=row, col=col)
    fig.update_yaxes(categoryorder='total ascending', row=row, col=col)

fig.update_layout(
    title_text='Top 8 Job Titles per Occupation Category (Excl. Unclassified)',
    height=800,
    template='plotly_white',
    margin=dict(l=20, r=80, t=80, b=20)
)
fig.write_image('figures/job_titles_by_occupation.png', scale=2)
fig.show()

# Occupation mix of the filtered dataset.
print("\nPostings per Occupation:")
for occ, count in df_clean['LOT_V6_OCCUPATION_NAME'].value_counts().items():
    print(f"  • {occ}: {count:,} ({count/len(df_clean)*100:.1f}%)")

Postings per Occupation:
  • Data / Data Mining Analyst: 22,352 (47.8%)
  • Business Intelligence Analyst: 21,244 (45.5%)
  • Business / Management Analyst: 2,999 (6.4%)
  • Market Research Analyst: 118 (0.3%)

Wordcloud of Job Descriptions

Code
# Using a sample of 3000 rows to prevent crashes

# Hand-curated stopwords: generic English words plus boilerplate common to job
# postings (EEO language, application instructions, resume clichés).
# NOTE: 'including' appears twice below; sets dedupe it, so this is harmless.
custom_stopwords = {
    'the', 'and', 'to', 'of', 'a', 'in', 'for', 'is', 'on', 'that', 'by', 'this',
    'with', 'are', 'be', 'as', 'at', 'from', 'or', 'an', 'will', 'your', 'you',
    'we', 'our', 'have', 'has', 'it', 'their', 'all', 'can', 'been', 'would',
    'who', 'more', 'if', 'about', 'which', 'when', 'what', 'into', 'also',
    'may', 'other', 'its', 'than', 'should', 'such', 'any', 'these', 'only',
    'new', 'well', 'them', 'they', 'but', 'not', 'do', 'up', 'out', 'so',
    'job', 'position', 'apply', 'applicant', 'employer', 'employment',
    'equal', 'opportunity', 'eeo', 'affirmative', 'action', 'disability',
    'race', 'color', 'religion', 'sex', 'national', 'origin', 'age',
    'status', 'protected', 'discrimination', 'including', 'without', 'regard',
    'com', 'www', 'http', 'https', 'click', 'here', 'learn', 'please', 'contact',
    'must', 'work', 'working', 'experience', 'years', 'year', 'required',
    'requirements', 'skills', 'ability', 'strong', 'excellent', 'good',
    'team', 'company', 'business', 'including', 'within', 'across', 'using'
}

# Fixed random_state makes the sample (and hence the cloud) reproducible.
df_sample = df_clean.sample(n=min(3000, len(df_clean)), random_state=42)
body_text = ' '.join(df_sample['BODY'].dropna().astype(str).tolist())

# Keep letters only, then collapse runs of whitespace to single spaces.
body_text = re.sub(r'[^a-zA-Z\s]', ' ', body_text.lower())
body_text = re.sub(r'\s+', ' ', body_text)

wordcloud = WordCloud(
    width=1200, 
    height=600,
    background_color='white',
    stopwords=custom_stopwords,
    max_words=100,
    colormap='viridis',
    collocations=False,  # count single words only, not bigrams
    random_state=42      # deterministic word placement
).generate(body_text)

plt.figure(figsize=(14, 7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud: Common Terms in Job Descriptions', fontsize=18, fontweight='bold', pad=15)
plt.tight_layout()
plt.savefig('figures/job_description_wordcloud.png', dpi=200, bbox_inches='tight')
plt.show()

print(f"Word cloud generated from {len(df_sample):,} sampled job postings")

Word cloud generated from 3,000 sampled job postings

Skills to Look For

Code
def extract_skills(skills_series):
    """Count individual skills across a Series of comma-separated skill strings.

    NaN cells, non-string cells, empty strings, and the 'Not Listed'
    placeholder are ignored; every other cell is split on commas and each
    token is whitespace-trimmed before counting.
    """
    counter = Counter()
    for cell in skills_series.dropna():
        if not isinstance(cell, str) or cell in ('Not Listed', ''):
            continue
        counter.update(token.strip() for token in cell.split(','))
    return counter

# Shorten skill names for better display
def shorten_skill(skill):
    """Map a skill name to a compact label suitable for radar-chart axes.

    Curated names get fixed abbreviations. The identity entries (e.g.
    'Problem Solving', 'Market Research') are deliberate: both are 15
    characters, so without a mapping they would be truncated below.
    Unknown names longer than 14 characters are cut to 14 plus '..'.
    """
    abbreviations = {
        'SQL (Programming Language)': 'SQL',
        'Microsoft Excel': 'Excel',
        'Microsoft Power BI': 'Power BI',
        'Computer Science': 'Computer Sci',
        'Problem Solving': 'Problem Solving',
        'Data Visualization': 'Data Viz',
        'Business Intelligence': 'Business Intel',
        'Project Management': 'Project Mgmt',
        'Data Management': 'Data Mgmt',
        'Business Development': 'Business Dev',
        'Customer Service': 'Customer Svc',
        'Marketing Strategy': 'Mktg Strategy',
        'Market Research': 'Market Research',
        'Statistical Analysis': 'Statistics'
    }
    if skill in abbreviations:
        return abbreviations[skill]
    return skill if len(skill) <= 14 else skill[:14] + '..'

# Abbreviated occupation names for compact subplot titles.
occ_short = {
    'Business Intelligence Analyst': 'Business Intelligence',
    'Data / Data Mining Analyst': 'Data/Data Mining',
    'Business / Management Analyst': 'Business/Management',
    'Market Research Analyst': 'Market Research'
}

colors_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
occupations = df_clean['LOT_V6_OCCUPATION_NAME'].unique().tolist()

# Get top 8 skills for EACH occupation separately
# 'values' holds each skill's prevalence as a % of that occupation's postings.
skill_data = {}
for occ in occupations:
    df_occ = df_clean[df_clean['LOT_V6_OCCUPATION_NAME'] == occ]
    occ_skills = extract_skills(df_occ['SKILLS_NAME'])
    top_8 = occ_skills.most_common(8)
    total = len(df_occ)
    skill_data[occ] = {
        'skills': [s[0] for s in top_8],
        'values': [(s[1] / total * 100) for s in top_8]
    }

# 2x2 grid of polar (radar) subplots — one per occupation.
fig = make_subplots(
    rows=2, cols=2,
    specs=[[{'type': 'polar'}, {'type': 'polar'}],
           [{'type': 'polar'}, {'type': 'polar'}]],
    subplot_titles=[occ_short.get(occ, occ) for occ in occupations],
    vertical_spacing=0.18,
    horizontal_spacing=0.12
)

for idx, occ in enumerate(occupations):
    # Row-major mapping of the flat index onto the 2x2 grid.
    row = idx // 2 + 1
    col = idx % 2 + 1
    
    skills = skill_data[occ]['skills']
    values = skill_data[occ]['values']
    
    # Shorten labels and close the polygon (repeat the first point so the
    # radar outline joins back to its start).
    short_labels = [shorten_skill(s) for s in skills]
    values_closed = values + [values[0]]
    labels_closed = short_labels + [short_labels[0]]
    
    fig.add_trace(
        go.Scatterpolar(
            r=values_closed,
            theta=labels_closed,
            fill='toself',
            name=occ_short.get(occ, occ),
            line_color=colors_list[idx],
            fillcolor=colors_list[idx],
            opacity=0.5,
            showlegend=False
        ),
        row=row, col=col
    )

# Update polar subplots
# Shared fixed radial range keeps the four radars visually comparable.
# NOTE(review): the 85 ceiling sits just above the max observed value (~83%);
# revisit if the data changes.
fig.update_polars(
    radialaxis=dict(
        visible=True,
        tickfont=dict(size=9),
        range=[0, 85]
    ),
    angularaxis=dict(
        tickfont=dict(size=10),
        rotation=90,
        direction='clockwise'
    )
)

fig.update_layout(
    title=dict(
        text='Top 8 Skills by Occupation (Each with Own Top Skills)',
        font=dict(size=16),
        y=0.98
    ),
    height=750,
    template='plotly_white',
    margin=dict(t=80, b=30, l=60, r=60)
)

# Adjust subplot title positions
# Nudge subplot titles up slightly so they clear the polar plots.
for annotation in fig['layout']['annotations']:
    annotation['font'] = dict(size=12, color='#333')
    annotation['y'] = annotation['y'] + 0.02

fig.write_image('figures/skills_radar_by_occupation.png', scale=2)
fig.show()

# Print the top skills for each occupation
print("\nTop 8 Skills by Occupation:")
for occ in occupations:
    print(f"\n{occ_short.get(occ, occ)}:")
    for skill, val in zip(skill_data[occ]['skills'], skill_data[occ]['values']):
        print(f"  • {skill}: {val:.1f}%")

# Software skills - Horizontal Bar Chart
# Prevalence is measured against all filtered postings (df_clean).
software_counts = extract_skills(df_clean['SOFTWARE_SKILLS_NAME'])
top_software = pd.DataFrame(software_counts.most_common(12), columns=['Software', 'Count'])
top_software['Percentage'] = (top_software['Count'] / len(df_clean) * 100).round(1)

fig2 = go.Figure()
fig2.add_trace(go.Bar(
    y=top_software['Software'],
    x=top_software['Percentage'],
    orientation='h',
    marker_color='mediumpurple',
    text=[f"{p}%" for p in top_software['Percentage']],
    textposition='outside'
))

fig2.update_layout(
    title='Top 12 Software/Technical Skills (% of Postings)',
    xaxis_title='% of Job Postings',
    # Largest bar on top for a horizontal chart.
    yaxis={'categoryorder': 'total ascending'},
    template='plotly_white',
    height=450,
    margin=dict(r=80)  # room for the outside-positioned % labels
)
fig2.write_image('figures/software_skills.png', scale=2)
fig2.show()

Top 8 Skills by Occupation:

Business Intelligence:
  • Communication: 43.7%
  • SAP Applications: 35.8%
  • Management: 35.7%
  • Business Process: 29.5%
  • Business Requirements: 26.7%
  • Problem Solving: 25.4%
  • Finance: 23.4%
  • Consulting: 23.1%

Data/Data Mining:
  • Data Analysis: 76.5%
  • SQL (Programming Language): 51.6%
  • Communication: 45.4%
  • Management: 33.8%
  • Python (Programming Language): 31.2%
  • Tableau (Business Intelligence Software): 30.7%
  • Microsoft Excel: 28.3%
  • Dashboard: 27.9%

Business/Management:
  • Communication: 51.2%
  • Management: 39.2%
  • Leadership: 34.2%
  • Operations: 33.8%
  • Microsoft Excel: 32.0%
  • Problem Solving: 30.9%
  • Project Management: 29.1%
  • Presentations: 27.5%

Market Research:
  • Customer Relationship Management: 83.1%
  • Business Process: 45.8%
  • Communication: 41.5%
  • Business Requirements: 41.5%
  • Salesforce: 40.7%
  • Project Management: 40.7%
  • Sales: 39.8%
  • Problem Solving: 38.1%

How does Remote Work Type affect Salary?

Code
# Salary Analysis - Violin Plot by Remote Work Type and Occupation
# Drop 'Not Specified' so only postings with a known remote type are compared.
df_remote = df_clean[df_clean['REMOTE_TYPE_NAME'] != 'Not Specified'].copy()

fig = px.violin(
    df_remote,
    x='REMOTE_TYPE_NAME',
    y='SALARY',
    color='LOT_V6_OCCUPATION_NAME',
    box=True,  # overlay a box plot inside each violin
    title='Salary Distribution by Remote Work Type and Occupation',
    labels={
        'REMOTE_TYPE_NAME': 'Remote Work Type',
        'SALARY': 'Annual Salary ($)',
        'LOT_V6_OCCUPATION_NAME': 'Occupation'
    }
)
fig.update_layout(
    template='plotly_white',
    height=500,
    # Horizontal legend below the plot area.
    legend=dict(orientation='h', yanchor='bottom', y=-0.3)
)
fig.write_image('figures/salary_by_remote.png', scale=2)
fig.show()

# Summary table; values are formatted as $ strings for display, so the
# resulting columns are text, not numeric.
print("\nSalary Statistics by Remote Type:")
salary_stats = df_remote.groupby('REMOTE_TYPE_NAME')['SALARY'].agg(['count', 'median', 'mean', 'std']).round(0)
salary_stats.columns = ['Count', 'Median', 'Mean', 'Std Dev']
salary_stats['Median'] = salary_stats['Median'].apply(lambda x: f"${x:,.0f}")
salary_stats['Mean'] = salary_stats['Mean'].apply(lambda x: f"${x:,.0f}")
salary_stats['Std Dev'] = salary_stats['Std Dev'].apply(lambda x: f"${x:,.0f}")
print(salary_stats)

Salary Statistics by Remote Type:
                  Count   Median      Mean  Std Dev
REMOTE_TYPE_NAME                                   
Hybrid Remote      1482  $95,300  $104,846  $28,957
Not Remote          688  $97,250  $104,763  $27,970
Remote             7875  $98,800  $109,097  $29,321

Where can you apply for these Jobs?

Code
def extract_sources(source_series):
    """Tally source-type labels from a Series of comma-separated strings.

    NaN and non-string cells are skipped; each remaining cell is split on
    commas, tokens are whitespace-trimmed, and empty tokens are discarded
    before counting.
    """
    tally = Counter()
    for entry in source_series.dropna():
        if isinstance(entry, str):
            trimmed = (part.strip() for part in entry.split(','))
            tally.update(token for token in trimmed if token)
    return tally

# Normalize raw source labels, then visualize the distribution as a treemap.
source_counts = extract_sources(df_clean['SOURCE_TYPES'])

# Fold vendor labels into friendlier names; placeholder labels are dropped.
rename_map = {'Job intermediary': 'Recruiter', 'FreeJobBoard': 'Job Board'}
cleaned_counts = {}
for raw_label, n in source_counts.items():
    if raw_label in ('NONE', ''):
        continue
    label = rename_map.get(raw_label, raw_label)
    cleaned_counts[label] = cleaned_counts.get(label, 0) + n

top_sources = pd.DataFrame(list(cleaned_counts.items()), columns=['Source', 'Count'])
top_sources = top_sources.sort_values('Count', ascending=False).head(10).reset_index(drop=True)
# Base is df_clean; a posting can carry several sources, so percentages may
# sum to more than 100%.
top_sources['Percentage'] = (top_sources['Count'] / len(df_clean) * 100).round(1)

fig = px.treemap(
    top_sources,
    path=['Source'],
    values='Count',
    title='Job Posting Sources Distribution',
    color='Count',
    color_continuous_scale='Blues'
)
fig.update_layout(template='plotly_white', height=500)
fig.update_traces(textinfo='label+value+percent root', textfont_size=14)
fig.write_image('figures/source_types.png', scale=2)
fig.show()

print("\nJob Posting Sources:")
for rank, row in top_sources.iterrows():
    print(f"  {rank+1}. {row['Source']}: {row['Count']:,} ({row['Percentage']}%)")

Job Posting Sources:
  1. Job Board: 38,924 (83.3%)
  2. Company: 13,318 (28.5%)
  3. Recruiter: 2,791 (6.0%)
  4. Government: 862 (1.8%)
  5. Education: 521 (1.1%)

EDA Conclusion

Our exploratory analysis of 46,700+ job postings reveals key insights for data analytics professionals:

Industry & Companies: Professional Services; Administrative Support & Waste Management; and Finance dominate hiring, with Deloitte, Accenture, PricewaterhouseCoopers, and Insight Global leading recruitment — consulting firms are the primary employers.

Skills Demand: Each occupation has distinct skill requirements. Data Analysts prioritize SQL and Data Analysis, while Business Analysts focus on Communication and Project Management. Market Research emphasizes Marketing and Customer insights.

Compensation: Median salaries range $95K-$99K across remote types, with remote positions slightly higher. Occupation type impacts salary more than remote status.

Job Sources: 83% of postings appear on Job Boards, making platforms like LinkedIn and Indeed essential for job seekers. Direct company applications (28%) remain valuable.